{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Server Machine Dataset (SMD) from OmniAnomaly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from typing import Final\n",
    "from collections.abc import Callable\n",
    "from config import data_raw_folder, data_processed_folder\n",
    "from timeeval import Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking for source datasets in /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset and\n",
      "saving processed datasets in /home/projects/akita/data/benchmark-data/data-processed\n"
     ]
    }
   ],
   "source": [
    "dataset_collection_name = \"SMD\"\n",
    "source_folder = os.path.join(data_raw_folder, \"Server Machine Dataset\")\n",
    "target_folder = data_processed_folder\n",
    "\n",
    "from pathlib import Path\n",
    "print(f\"Looking for source datasets in {Path(source_folder).absolute()} and\\nsaving processed datasets in {Path(target_folder).absolute()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "file handling and transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def list_regular_files(path: str) -> list[str]:\n",
    "    return [f for f in os.listdir(path)if os.path.isfile(os.path.join(path, f))]\n",
    "\n",
    "def get_source_path(file, tpe=\"train\"):\n",
    "    return os.path.join(source_folder, tpe, file)\n",
    "\n",
    "def calc_size(filename: str) -> int:\n",
    "    with open(filename, 'r') as f:\n",
    "        c = 0\n",
    "        for line in f:\n",
    "            c += 1\n",
    "    return c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transform_and_label(source_file: str, target: str, tpe: str) -> int:\n",
    "    df = pd.read_csv(get_source_path(source_file, tpe), header=None)\n",
    "    df.index.name = \"timestamp\"\n",
    "    df.columns = list(map(lambda v: f\"value-{v}\", df.columns))\n",
    "\n",
    "    if tpe == \"test\":\n",
    "        df_label = pd.read_csv(get_source_path(source_file, \"test_label\"), header=None)\n",
    "        df_label.columns=[\"is_anomaly\"]\n",
    "        df = pd.merge(df, df_label, left_index=True, right_index=True, how=\"inner\")\n",
    "    else:\n",
    "        df[\"is_anomaly\"] = 0\n",
    "\n",
    "    df.to_csv(target)\n",
    "    return len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Directories /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD already exist\n"
     ]
    }
   ],
   "source": [
    "# shared by all datasets\n",
    "dataset_type = \"real\"\n",
    "train_is_normal = True\n",
    "train_type = \"semi-supervised\"\n",
    "input_type = \"multivariate\"\n",
    "datetime_index = False\n",
    "\n",
    "# create target directory\n",
    "dataset_subfolder = os.path.join(input_type, dataset_collection_name)\n",
    "target_subfolder = os.path.join(target_folder, dataset_subfolder)\n",
    "try:\n",
    "    os.makedirs(target_subfolder)\n",
    "    print(f\"Created directories {target_subfolder}\")\n",
    "except FileExistsError:\n",
    "    print(f\"Directories {target_subfolder} already exist\")\n",
    "    pass\n",
    "\n",
    "dm = Datasets(target_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-6.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-6.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-8.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-8.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-1.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-1.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-1.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-1.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-9.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-9.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-9.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-9.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-10.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-10.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-10.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-10.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-7.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-7.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-7.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-7.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-2.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-2.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-2.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-2.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-1.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-1.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-1.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-1.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-5.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-5.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-5.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-5.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-7.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-7.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-7.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-7.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-2.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-2.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-2.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-2.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-4.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-4.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-4.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-4.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-2.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-2.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-2.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-2.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-3.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-3.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-3.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-3.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-4.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-4.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-4.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-4.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-6.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-6.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-5.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-5.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-5.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-5.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-1.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-1.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-1.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-1.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-9.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-9.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-9.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-9.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-6.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-6.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-6.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-8.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-8.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-3.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-3.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-3.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-3.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-2-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-8.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-2-8.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-2-8.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-5.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-5.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-5.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-5.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-3-11.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-11.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-3-11.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-3-11.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-4.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-4.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-4.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-4.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-3.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-3.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-3.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-3.test.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/train/machine-1-7.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-7.train.csv\n",
      "Processed source dataset /home/projects/akita/data/benchmark-data/data-raw/Server Machine Dataset/test/machine-1-7.txt -> /home/projects/akita/data/benchmark-data/data-processed/multivariate/SMD/machine-1-7.test.csv\n"
     ]
    }
   ],
   "source": [
    "# dataset transformation\n",
    "transform_file: Callable[[str, str, str], int] = transform_and_label\n",
    "\n",
    "for f in list_regular_files(get_source_path(\".\")):\n",
    "    paths = {}\n",
    "    for t_type in [\"train\", \"test\"]:\n",
    "        dataset_name = os.path.splitext(f)[0]\n",
    "        source_file = get_source_path(f, t_type)\n",
    "        filename = f\"{dataset_name}.{t_type}.csv\"\n",
    "        path = os.path.join(dataset_subfolder, filename)\n",
    "        target_filepath = os.path.join(target_subfolder, filename)\n",
    "        paths[t_type] = path\n",
    "        \n",
    "        # transform file\n",
    "        dataset_length = transform_file(f, target_filepath, t_type)\n",
    "        print(f\"Processed source dataset {source_file} -> {target_filepath}\")\n",
    "\n",
    "    # save metadata\n",
    "    dm.add_dataset((dataset_collection_name, dataset_name),\n",
    "        train_path = paths[\"train\"],\n",
    "        test_path = paths[\"test\"],\n",
    "        dataset_type = dataset_type,\n",
    "        datetime_index = datetime_index,\n",
    "        split_at = None,\n",
    "        train_type = train_type,\n",
    "        train_is_normal = train_is_normal,\n",
    "        input_type = input_type,\n",
    "        dataset_length = dataset_length\n",
    "    )\n",
    "\n",
    "# save metadata of benchmark\n",
    "dm.save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>train_path</th>\n",
       "      <th>test_path</th>\n",
       "      <th>dataset_type</th>\n",
       "      <th>datetime_index</th>\n",
       "      <th>split_at</th>\n",
       "      <th>train_type</th>\n",
       "      <th>train_is_normal</th>\n",
       "      <th>input_type</th>\n",
       "      <th>length</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>collection_name</th>\n",
       "      <th>dataset_name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"28\" valign=\"top\">SMD</th>\n",
       "      <th>machine-1-1</th>\n",
       "      <td>multivariate/SMD/machine-1-1.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-1.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28479</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-1-2</th>\n",
       "      <td>multivariate/SMD/machine-1-2.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-2.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23694</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-1-3</th>\n",
       "      <td>multivariate/SMD/machine-1-3.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-3.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-1-4</th>\n",
       "      <td>multivariate/SMD/machine-1-4.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-4.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-1-5</th>\n",
       "      <td>multivariate/SMD/machine-1-5.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-5.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23706</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-1-6</th>\n",
       "      <td>multivariate/SMD/machine-1-6.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-6.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-1-7</th>\n",
       "      <td>multivariate/SMD/machine-1-7.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-7.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-1-8</th>\n",
       "      <td>multivariate/SMD/machine-1-8.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-1-8.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-1</th>\n",
       "      <td>multivariate/SMD/machine-2-1.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-1.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23694</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-2</th>\n",
       "      <td>multivariate/SMD/machine-2-2.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-2.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-3</th>\n",
       "      <td>multivariate/SMD/machine-2-3.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-3.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-4</th>\n",
       "      <td>multivariate/SMD/machine-2-4.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-4.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-5</th>\n",
       "      <td>multivariate/SMD/machine-2-5.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-5.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23689</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-6</th>\n",
       "      <td>multivariate/SMD/machine-2-6.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-6.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28743</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-7</th>\n",
       "      <td>multivariate/SMD/machine-2-7.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-7.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-8</th>\n",
       "      <td>multivariate/SMD/machine-2-8.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-8.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-2-9</th>\n",
       "      <td>multivariate/SMD/machine-2-9.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-2-9.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28722</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-1</th>\n",
       "      <td>multivariate/SMD/machine-3-1.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-1.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-10</th>\n",
       "      <td>multivariate/SMD/machine-3-10.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-10.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23693</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-11</th>\n",
       "      <td>multivariate/SMD/machine-3-11.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-11.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-2</th>\n",
       "      <td>multivariate/SMD/machine-3-2.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-2.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-3</th>\n",
       "      <td>multivariate/SMD/machine-3-3.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-3.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-4</th>\n",
       "      <td>multivariate/SMD/machine-3-4.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-4.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23687</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-5</th>\n",
       "      <td>multivariate/SMD/machine-3-5.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-5.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>23691</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-6</th>\n",
       "      <td>multivariate/SMD/machine-3-6.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-6.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28726</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-7</th>\n",
       "      <td>multivariate/SMD/machine-3-7.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-7.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-8</th>\n",
       "      <td>multivariate/SMD/machine-3-8.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-8.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>machine-3-9</th>\n",
       "      <td>multivariate/SMD/machine-3-9.train.csv</td>\n",
       "      <td>multivariate/SMD/machine-3-9.test.csv</td>\n",
       "      <td>real</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>semi-supervised</td>\n",
       "      <td>True</td>\n",
       "      <td>multivariate</td>\n",
       "      <td>28713</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                           train_path  \\\n",
       "collection_name dataset_name                                            \n",
       "SMD             machine-1-1    multivariate/SMD/machine-1-1.train.csv   \n",
       "                machine-1-2    multivariate/SMD/machine-1-2.train.csv   \n",
       "                machine-1-3    multivariate/SMD/machine-1-3.train.csv   \n",
       "                machine-1-4    multivariate/SMD/machine-1-4.train.csv   \n",
       "                machine-1-5    multivariate/SMD/machine-1-5.train.csv   \n",
       "                machine-1-6    multivariate/SMD/machine-1-6.train.csv   \n",
       "                machine-1-7    multivariate/SMD/machine-1-7.train.csv   \n",
       "                machine-1-8    multivariate/SMD/machine-1-8.train.csv   \n",
       "                machine-2-1    multivariate/SMD/machine-2-1.train.csv   \n",
       "                machine-2-2    multivariate/SMD/machine-2-2.train.csv   \n",
       "                machine-2-3    multivariate/SMD/machine-2-3.train.csv   \n",
       "                machine-2-4    multivariate/SMD/machine-2-4.train.csv   \n",
       "                machine-2-5    multivariate/SMD/machine-2-5.train.csv   \n",
       "                machine-2-6    multivariate/SMD/machine-2-6.train.csv   \n",
       "                machine-2-7    multivariate/SMD/machine-2-7.train.csv   \n",
       "                machine-2-8    multivariate/SMD/machine-2-8.train.csv   \n",
       "                machine-2-9    multivariate/SMD/machine-2-9.train.csv   \n",
       "                machine-3-1    multivariate/SMD/machine-3-1.train.csv   \n",
       "                machine-3-10  multivariate/SMD/machine-3-10.train.csv   \n",
       "                machine-3-11  multivariate/SMD/machine-3-11.train.csv   \n",
       "                machine-3-2    multivariate/SMD/machine-3-2.train.csv   \n",
       "                machine-3-3    multivariate/SMD/machine-3-3.train.csv   \n",
       "                machine-3-4    multivariate/SMD/machine-3-4.train.csv   \n",
       "                machine-3-5    multivariate/SMD/machine-3-5.train.csv   \n",
       "                machine-3-6    multivariate/SMD/machine-3-6.train.csv   \n",
       "                machine-3-7    multivariate/SMD/machine-3-7.train.csv   \n",
       "                machine-3-8    multivariate/SMD/machine-3-8.train.csv   \n",
       "                machine-3-9    multivariate/SMD/machine-3-9.train.csv   \n",
       "\n",
       "                                                           test_path  \\\n",
       "collection_name dataset_name                                           \n",
       "SMD             machine-1-1    multivariate/SMD/machine-1-1.test.csv   \n",
       "                machine-1-2    multivariate/SMD/machine-1-2.test.csv   \n",
       "                machine-1-3    multivariate/SMD/machine-1-3.test.csv   \n",
       "                machine-1-4    multivariate/SMD/machine-1-4.test.csv   \n",
       "                machine-1-5    multivariate/SMD/machine-1-5.test.csv   \n",
       "                machine-1-6    multivariate/SMD/machine-1-6.test.csv   \n",
       "                machine-1-7    multivariate/SMD/machine-1-7.test.csv   \n",
       "                machine-1-8    multivariate/SMD/machine-1-8.test.csv   \n",
       "                machine-2-1    multivariate/SMD/machine-2-1.test.csv   \n",
       "                machine-2-2    multivariate/SMD/machine-2-2.test.csv   \n",
       "                machine-2-3    multivariate/SMD/machine-2-3.test.csv   \n",
       "                machine-2-4    multivariate/SMD/machine-2-4.test.csv   \n",
       "                machine-2-5    multivariate/SMD/machine-2-5.test.csv   \n",
       "                machine-2-6    multivariate/SMD/machine-2-6.test.csv   \n",
       "                machine-2-7    multivariate/SMD/machine-2-7.test.csv   \n",
       "                machine-2-8    multivariate/SMD/machine-2-8.test.csv   \n",
       "                machine-2-9    multivariate/SMD/machine-2-9.test.csv   \n",
       "                machine-3-1    multivariate/SMD/machine-3-1.test.csv   \n",
       "                machine-3-10  multivariate/SMD/machine-3-10.test.csv   \n",
       "                machine-3-11  multivariate/SMD/machine-3-11.test.csv   \n",
       "                machine-3-2    multivariate/SMD/machine-3-2.test.csv   \n",
       "                machine-3-3    multivariate/SMD/machine-3-3.test.csv   \n",
       "                machine-3-4    multivariate/SMD/machine-3-4.test.csv   \n",
       "                machine-3-5    multivariate/SMD/machine-3-5.test.csv   \n",
       "                machine-3-6    multivariate/SMD/machine-3-6.test.csv   \n",
       "                machine-3-7    multivariate/SMD/machine-3-7.test.csv   \n",
       "                machine-3-8    multivariate/SMD/machine-3-8.test.csv   \n",
       "                machine-3-9    multivariate/SMD/machine-3-9.test.csv   \n",
       "\n",
       "                             dataset_type  datetime_index  split_at  \\\n",
       "collection_name dataset_name                                          \n",
       "SMD             machine-1-1          real           False       NaN   \n",
       "                machine-1-2          real           False       NaN   \n",
       "                machine-1-3          real           False       NaN   \n",
       "                machine-1-4          real           False       NaN   \n",
       "                machine-1-5          real           False       NaN   \n",
       "                machine-1-6          real           False       NaN   \n",
       "                machine-1-7          real           False       NaN   \n",
       "                machine-1-8          real           False       NaN   \n",
       "                machine-2-1          real           False       NaN   \n",
       "                machine-2-2          real           False       NaN   \n",
       "                machine-2-3          real           False       NaN   \n",
       "                machine-2-4          real           False       NaN   \n",
       "                machine-2-5          real           False       NaN   \n",
       "                machine-2-6          real           False       NaN   \n",
       "                machine-2-7          real           False       NaN   \n",
       "                machine-2-8          real           False       NaN   \n",
       "                machine-2-9          real           False       NaN   \n",
       "                machine-3-1          real           False       NaN   \n",
       "                machine-3-10         real           False       NaN   \n",
       "                machine-3-11         real           False       NaN   \n",
       "                machine-3-2          real           False       NaN   \n",
       "                machine-3-3          real           False       NaN   \n",
       "                machine-3-4          real           False       NaN   \n",
       "                machine-3-5          real           False       NaN   \n",
       "                machine-3-6          real           False       NaN   \n",
       "                machine-3-7          real           False       NaN   \n",
       "                machine-3-8          real           False       NaN   \n",
       "                machine-3-9          real           False       NaN   \n",
       "\n",
       "                                   train_type  train_is_normal    input_type  \\\n",
       "collection_name dataset_name                                                   \n",
       "SMD             machine-1-1   semi-supervised             True  multivariate   \n",
       "                machine-1-2   semi-supervised             True  multivariate   \n",
       "                machine-1-3   semi-supervised             True  multivariate   \n",
       "                machine-1-4   semi-supervised             True  multivariate   \n",
       "                machine-1-5   semi-supervised             True  multivariate   \n",
       "                machine-1-6   semi-supervised             True  multivariate   \n",
       "                machine-1-7   semi-supervised             True  multivariate   \n",
       "                machine-1-8   semi-supervised             True  multivariate   \n",
       "                machine-2-1   semi-supervised             True  multivariate   \n",
       "                machine-2-2   semi-supervised             True  multivariate   \n",
       "                machine-2-3   semi-supervised             True  multivariate   \n",
       "                machine-2-4   semi-supervised             True  multivariate   \n",
       "                machine-2-5   semi-supervised             True  multivariate   \n",
       "                machine-2-6   semi-supervised             True  multivariate   \n",
       "                machine-2-7   semi-supervised             True  multivariate   \n",
       "                machine-2-8   semi-supervised             True  multivariate   \n",
       "                machine-2-9   semi-supervised             True  multivariate   \n",
       "                machine-3-1   semi-supervised             True  multivariate   \n",
       "                machine-3-10  semi-supervised             True  multivariate   \n",
       "                machine-3-11  semi-supervised             True  multivariate   \n",
       "                machine-3-2   semi-supervised             True  multivariate   \n",
       "                machine-3-3   semi-supervised             True  multivariate   \n",
       "                machine-3-4   semi-supervised             True  multivariate   \n",
       "                machine-3-5   semi-supervised             True  multivariate   \n",
       "                machine-3-6   semi-supervised             True  multivariate   \n",
       "                machine-3-7   semi-supervised             True  multivariate   \n",
       "                machine-3-8   semi-supervised             True  multivariate   \n",
       "                machine-3-9   semi-supervised             True  multivariate   \n",
       "\n",
       "                              length  \n",
       "collection_name dataset_name          \n",
       "SMD             machine-1-1    28479  \n",
       "                machine-1-2    23694  \n",
       "                machine-1-3    23703  \n",
       "                machine-1-4    23707  \n",
       "                machine-1-5    23706  \n",
       "                machine-1-6    23689  \n",
       "                machine-1-7    23697  \n",
       "                machine-1-8    23699  \n",
       "                machine-2-1    23694  \n",
       "                machine-2-2    23700  \n",
       "                machine-2-3    23689  \n",
       "                machine-2-4    23689  \n",
       "                machine-2-5    23689  \n",
       "                machine-2-6    28743  \n",
       "                machine-2-7    23696  \n",
       "                machine-2-8    23703  \n",
       "                machine-2-9    28722  \n",
       "                machine-3-1    28700  \n",
       "                machine-3-10   23693  \n",
       "                machine-3-11   28696  \n",
       "                machine-3-2    23703  \n",
       "                machine-3-3    23703  \n",
       "                machine-3-4    23687  \n",
       "                machine-3-5    23691  \n",
       "                machine-3-6    28726  \n",
       "                machine-3-7    28705  \n",
       "                machine-3-8    28704  \n",
       "                machine-3-9    28713  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dm.refresh()\n",
    "# select every dataset entry that belongs to this collection\n",
    "collection_rows = slice(dataset_collection_name, dataset_collection_name)\n",
    "all_datasets = slice(None)\n",
    "dm.df().loc[(collection_rows, all_datasets)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Experimentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Print the mapping from raw source files to processed target file names (only prints).\n",
    "train_folder = os.path.join(source_folder, \"train\")\n",
    "for source_name in list_regular_files(train_folder):\n",
    "    machine_name = os.path.splitext(source_name)[0]\n",
    "    for split in [\"train\", \"test\"]:\n",
    "        sources = get_source_path(source_name, split)\n",
    "        if split == \"test\":\n",
    "            # the test split additionally lists the file from the test_label folder\n",
    "            sources = sources + \" & \" + get_source_path(source_name, \"test_label\")\n",
    "        target = os.path.join(dataset_subfolder, f\"{machine_name}.{split}.csv\")\n",
    "        print(split, \":\", sources, \"->\", target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prototype: load one machine's test values and labels and join them row by row.\n",
    "df_values = pd.read_csv(get_source_path(\"machine-1-1.txt\", \"test\"), header=None)\n",
    "df_values.index.name = \"timestamp\"\n",
    "df_values.columns = [f\"value-{c}\" for c in df_values.columns]\n",
    "\n",
    "df_label = pd.read_csv(get_source_path(\"machine-1-1.txt\", \"test_label\"), header=None)\n",
    "df_label.columns = [\"is_anomaly\"]\n",
    "\n",
    "# inner join on the row index keeps only rows present in both files\n",
    "df = pd.merge(df_values, df_label, left_index=True, right_index=True, how=\"inner\")\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read one processed training file back from the target folder\n",
    "processed_train_path = os.path.join(\n",
    "    target_folder, input_type, dataset_collection_name, \"machine-3-11.train.csv\"\n",
    ")\n",
    "pd.read_csv(processed_train_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "timeeval",
   "language": "python",
   "name": "timeeval"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
